insurance <- read_delim("data/raw/HealthInsurance.csv")
## Rows: 8802 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): health, limit, gender, insurance, married, selfemp, region, ethnici...
## dbl (3): rownames, age, family
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
table(insurance$education)
##
## bachelor ged highschool master none other phd
## 1549 374 4434 524 1119 667 135
#Просмотр датасета
#view(insurance)
#Проверка датасета на наличие отсутствующих данных
#colSums(is.na(insurance))
#Смена типа переменных с character на factor
insurance <- insurance %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(education = education %>% factor(levels = c("none", "ged", "highschool", "bachelor", "master", "phd", "other")))
#Переименовывание переменных на русский язык
# insurance <- insurance %>%
# rename(
# `Идентификационный номер` = rownames,
# `Статус здоровья` = health,
# `Возраст` = age,
# `Ограничения` = limit,
# `Пол` = gender,
# `Медицинская страховка` = insurance,
# `Семейный статус: замужем / женат` = married,
# `Самозанятость` = selfemp,
# `Количество членов семьи` = family,
# `Регион проживания` = region,
# `Национальность` = ethnicity,
# `Образование` = education
# )
#Присваивание лейблов
#Описательные статистики для категориальнх переменных
#summary(insurance)
output_for_categorical_variables <- insurance %>%
select(where(is.factor)) %>% #Выбираем только качественные данные
pivot_longer(cols = everything(), names_to = "variable", values_to = "category") %>%
group_by(variable, category) %>% #Группируем данные
summarise(count = n(), .groups = "drop") %>%
group_by(variable) %>%
mutate(total = sum(count),
percentage = round(count / total * 100, 2)) %>%
#ungroup() %>%
select(variable, category, count, percentage) %>%
rename(
Признак = variable,
Группа = category,
`Количество людей` = count,
`Процент от общего количества` = percentage
) %>%
flextable() %>%
theme_apa() %>%
merge_v(j=1) %>%
autofit()
output_for_categorical_variables
Признак | Группа | Количество людей | Процент от общего количества |
|---|---|---|---|
education | other | 667 | 7.58 |
none | 1,119 | 12.71 | |
ged | 374 | 4.25 | |
highschool | 4,434 | 50.37 | |
bachelor | 1,549 | 17.60 | |
master | 524 | 5.95 | |
phd | 135 | 1.53 | |
ethnicity | afam | 1,083 | 12.30 |
cauc | 7,354 | 83.55 | |
other | 365 | 4.15 | |
gender | female | 4,169 | 47.36 |
male | 4,633 | 52.64 | |
health | no | 629 | 7.15 |
yes | 8,173 | 92.85 | |
insurance | no | 1,750 | 19.88 |
yes | 7,052 | 80.12 | |
limit | no | 7,571 | 86.01 |
yes | 1,231 | 13.99 | |
married | no | 3,369 | 38.28 |
yes | 5,433 | 61.72 | |
region | midwest | 2,023 | 22.98 |
northeast | 1,682 | 19.11 | |
south | 3,075 | 34.94 | |
west | 2,022 | 22.97 | |
selfemp | no | 7,731 | 87.83 |
yes | 1,071 | 12.17 |
#Описательные статистики для количественных переменных
output_for_numerical_variables <- insurance %>%
select("age", "family") %>% #Выбираем только количественные данные
summarise(across(everything(),
list(
mean = ~ round(mean(.x), 2), #Округляем до сотых
median = ~ median(.x),
sd = ~ round(sd(.x), 2), #Округляем до сотых
min = ~ min(.x),
max = ~ max(.x),
q1 = ~ quantile(.x, 0.25),
q3 = ~ quantile(.x, 0.75)
), .names = "{col}_{fn}")) %>% #Считаем статистики, выводим их в формате "имя переменной_название статистики"
pivot_longer(everything(), names_to = "variable_statistics", values_to = "value") %>% #Переводим данные в длинный формат, чтобы затем разделить первый столбик на два отдельных с именем переменной и названием статистики
separate(variable_statistics, into = c("variable", "statistics"), sep = "_") %>%
pivot_wider(names_from = statistics, values_from = value) %>% #возвращаем всё в широкий формат, чтобы данные были сгруппированы по имени переменной
rename(
`Переменная` = variable,
`Среднее значение` = mean,
`Медиана` = median,
`Стандартное отклонение` = sd,
`Минимальное значение` = min,
`Максимальное значение` = max,
`Квантиль 1` = q1,
`Квантиль 3` = q3
) %>%
flextable() %>%
theme_apa()
output_for_numerical_variables
Переменная | Среднее значение | Медиана | Стандартное отклонение | Минимальное значение | Максимальное значение | Квантиль 1 | Квантиль 3 |
|---|---|---|---|---|---|---|---|
age | 38.94 | 39.00 | 11.11 | 18.00 | 62.00 | 30.00 | 48.00 |
family | 3.09 | 3.00 | 1.56 | 1.00 | 14.00 | 2.00 | 4.00 |
theme_plots <- theme_classic() +
theme(plot.title = element_text(size = 23, hjust = 0.5),
plot.subtitle = element_text(size = 20, hjust = 0.5),
axis.title = element_text(size = 20),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 18),
legend.title = element_text(size = 20),
legend.text = element_text(size = 18),
strip.text = element_text(size = 20))
theme_set(theme_plots)
# barplots function
barplot_fn <- function(col) {
label <- rlang::englue("Barplot of {{col}} variable\nwith proportion")
insurance %>%
ggplot() +
geom_bar(aes(x = "", fill = {{col}}),
position = "fill", width = 0.5) +
labs(title = label,
y = "Proportion",
x = rlang::englue("{{col}}")) +
scale_fill_brewer(name = rlang::englue("{{col}}"), palette = "RdYlBu")
}
col_list <- insurance %>%
select(where(is.factor)) %>%
names() %>%
syms()
bar_plots <- purrr::map(col_list,
~ barplot_fn(!!.x))
wrap_plots(bar_plots, ncol = 1)
hist_fn <- function(col, width) {
ggplot(insurance, aes(x = {{col}})) +
geom_boxplot(
alpha = 0.5,
width = width,
linewidth = 1.5,
outlier.size = 2
) +
geom_histogram(
fill = "mediumpurple3",
alpha = 0.5,
colour = "black",
bins = 25
) +
labs(title = rlang::englue("Histogram of the distribution of variable {{col}}\nand boxplot with median and IQR"),
y = "Quantity",
x = rlang::englue("{{col}}")) +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 25),
axis.text.y = element_text(size = 23),
axis.text.x = element_text(size = 23),
legend.title = element_text(size = 23),
legend.text = element_text(size = 23))
}
plot1 <- hist_fn(age, 100)
plot2 <- hist_fn(family, 400)
plot1 / plot2
# barplots function
barplot_fn_ins <- function(col) {
label <- rlang::englue("Barplot of {{col}} variable\nwith proportion by insurance")
insurance %>%
ggplot() +
geom_bar(aes(x = insurance, fill = {{col}}),
position = "fill", width = 0.5) +
labs(title = label,
y = "proportion",
x = "insurance") +
scale_fill_brewer(name = rlang::englue("{{col}}"), palette = "RdYlBu")
}
col_list <- insurance %>%
select(where(is.factor), -insurance) %>%
names() %>%
syms()
bar_plots_ins <- purrr::map(col_list,
~ barplot_fn_ins(!!.x))
wrap_plots(bar_plots_ins, ncol = 1)
hist_fn <- function(col, width) {
ggplot(insurance, aes(x = {{col}}, fill = insurance)) +
geom_boxplot(
alpha = 0.5,
width = width,
linewidth = 1.5,
outlier.size = 2,
show.legend = FALSE
) +
geom_histogram(
alpha = 0.5,
colour = "black",
bins = 25
) +
facet_wrap(~insurance) +
labs(title = rlang::englue("Histogram of the distribution of variable {{col}}\nand boxplot with median and IQR"),
subtitle = "by insurance",
y = "Quantity",
x = rlang::englue("{{col}}")) +
scale_fill_brewer(name = rlang::englue("{{col}}"), palette = "RdYlBu") +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 25),
axis.text.y = element_text(size = 23),
axis.text.x = element_text(size = 23),
legend.title = element_text(size = 23),
legend.text = element_text(size = 23))
}
plot1 <- hist_fn(age, 100)
plot2 <- hist_fn(family, 400)
plot1 / plot2
ggplot(data = insurance, aes(x = age, fill = insurance)) +
geom_density(alpha = 0.5, adjust = 1.2) +
theme_minimal()
ggplot(insurance, aes(x = insurance, y = age, fill = insurance)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_boxplot(width = 0.15, outlier.shape = NA, color = "black") +
labs(
title = "Возраст в зависимости по наличию страховки: violin + box",
x = "Наличие страховки",
y = "Возраст (лет)"
) +
theme_minimal() +
theme(legend.position = "none")
sum_age <- insurance %>%
group_by(insurance) %>%
summarise(
n = sum(!is.na(age)),
mean = mean(age, na.rm = TRUE),
sd = sd(age, na.rm = TRUE),
se = sd/sqrt(n),
t = qt(0.975, df = n - 1),
low = mean - t*se,
up = mean + t*se,
.groups = "drop"
)
ggplot(sum_age, aes(x = insurance, y = mean)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = low, ymax = up), width = 0.15) +
labs(title = "Среднее ± 95% ДИ: возраст",
x = "Страховка", y = "Возраст (лет)") +
theme_minimal()
# мера ассоциации - разница средних
# инструмент оценки стат. значимости - t- тест
# Выполняем t-критерий для независимых выборок: возраст в зависимости от наличия страховки
t_test_age <- t.test(age ~ insurance, data = insurance, var.equal = TRUE)
# Выводим результаты
cat("### t-критерий для независимых выборок: Возраст в зависимости от наличия страховки\n")
## ### t-критерий для независимых выборок: Возраст в зависимости от наличия страховки
print(t_test_age)
##
## Two Sample t-test
##
## data: age by insurance
## t = -14.332, df = 8800, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -4.779139 -3.629096
## sample estimates:
## mean in group no mean in group yes
## 35.56857 39.77269
ggplot(data = insurance, aes(x = family, fill = insurance)) +
geom_density(alpha = 0.5, adjust = 1.2) +
theme_minimal()
ggplot(data = insurance, aes(x = family, fill = insurance)) +
geom_bar(alpha = 0.5, adjust = 1.2) +
theme_minimal()
## Warning in geom_bar(alpha = 0.5, adjust = 1.2): Ignoring unknown parameters:
## `adjust`
# Выполняем t-критерий для независимых выборок: размер семьи в зависимости от наличия страховки
t_test_family <- t.test(family ~ insurance, data = insurance, var.equal = TRUE)
# Выводим результаты в консоль
cat("### t-критерий для независимых выборок: размер семьи в зависимости от наличия страховки")
## ### t-критерий для независимых выборок: размер семьи в зависимости от наличия страховки
print(t_test_family)
##
## Two Sample t-test
##
## data: family by insurance
## t = 7.15, df = 8800, p-value = 9.369e-13
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 0.2155537 0.3783868
## sample estimates:
## mean in group no mean in group yes
## 3.331429 3.034458
insurance %>% filter (gender == "male" & insurance == "yes") %>% count()
## # A tibble: 1 × 1
## n
## <int>
## 1 3598
library(epitools)
## Warning: пакет 'epitools' был собран под R версии 4.5.2
cont_table_2x2 <- matrix(
c(3598, 1035, # male: yes, no
3454, 715), # female: yes, no
nrow = 2, byrow = TRUE,
dimnames = list(
sex = c("male", "female"),
insurance = c("yes", "no")
)
)
# RR (male vs female)
riskratio(cont_table_2x2, rev = "neither")
## $data
## insurance
## sex yes no Total
## male 3598 1035 4633
## female 3454 715 4169
## Total 7052 1750 8802
##
## $measure
## risk ratio with 95% C.I.
## sex estimate lower upper
## male 1.0000000 NA NA
## female 0.7677081 0.7047005 0.8363492
##
## $p.value
## two-sided
## sex midp.exact fisher.exact chi.square
## male NA NA NA
## female 1.022507e-09 1.042677e-09 1.123433e-09
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
# хи-квадрат и доли рисков по строкам
chisq.test(cont_table_2x2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table_2x2
## X-squared = 36.773, df = 1, p-value = 1.327e-09
round(prop.table(cont_table_2x2, 1), 3)
## insurance
## sex yes no
## male 0.777 0.223
## female 0.828 0.172
# альтернативно: можно напрямую передать в chisq.test()
chisq.test(insurance$insurance, insurance$gender)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: insurance$insurance and insurance$gender
## X-squared = 36.773, df = 1, p-value = 1.327e-09
library(epitools)
# фиксируем порядок уровней
insurance$married <- factor(insurance$married, levels = c("yes","no"))
insurance$insurance <- factor(insurance$insurance, levels = c("yes","no"))
# частоты
tab <- table(insurance$married, insurance$insurance)
# матрица 2x2 в формате (rows: married yes/no; cols: insurance yes/no)
cont_table_2x2 <- matrix(
c(tab["yes","yes"], tab["yes","no"],
tab["no","yes"], tab["no","no"]),
nrow = 2, byrow = TRUE,
dimnames = list(
married = c("yes", "no"),
insurance = c("yes", "no")
)
)
cont_table_2x2
## insurance
## married yes no
## yes 4659 774
## no 2393 976
# RR (married vs not married)
riskratio(cont_table_2x2, rev = "neither")
## $data
## insurance
## married yes no Total
## yes 4659 774 5433
## no 2393 976 3369
## Total 7052 1750 8802
##
## $measure
## risk ratio with 95% C.I.
## married estimate lower upper
## yes 1.000000 NA NA
## no 2.033516 1.869725 2.211655
##
## $p.value
## two-sided
## married midp.exact fisher.exact chi.square
## yes NA NA NA
## no 0 5.86554e-62 1.654726e-63
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
# χ² и доли по строкам
chisq.test(cont_table_2x2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table_2x2
## X-squared = 282.09, df = 1, p-value < 2.2e-16
round(prop.table(cont_table_2x2, 1), 3)
## insurance
## married yes no
## yes 0.858 0.142
## no 0.710 0.290
# Создаем таблицу сопряженности
cont_table_2x2 <- table(insurance$insurance, insurance$selfemp)
print(cont_table_2x2)
##
## no yes
## yes 6314 738
## no 1417 333
# H0: признаки независимы
# H1: признаки зависимы
chisq_2x2 <- chisq.test(cont_table_2x2)
print(chisq_2x2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table_2x2
## X-squared = 95.407, df = 1, p-value < 2.2e-16
library(epitools)
# фиксируем порядок уровней
insurance$selfemp <- factor(insurance$selfemp, levels = c("yes","no"))
insurance$insurance <- factor(insurance$insurance, levels = c("yes","no"))
# частоты и матрица 2×2:
tab <- table(insurance$selfemp, insurance$insurance)
cont_table_2x2 <- matrix(
c(tab["yes","yes"], tab["yes","no"],
tab["no","yes"], tab["no","no"]),
nrow = 2, byrow = TRUE,
dimnames = list(
selfemp = c("yes","no"),
insurance = c("yes","no")
)
)
print(cont_table_2x2)
## insurance
## selfemp yes no
## yes 738 333
## no 6314 1417
# RR (self-employed vs not self-employed)
riskratio(cont_table_2x2, rev = "neither")
## $data
## insurance
## selfemp yes no Total
## yes 738 333 1071
## no 6314 1417 7731
## Total 7052 1750 8802
##
## $measure
## risk ratio with 95% C.I.
## selfemp estimate lower upper
## yes 1.000000 NA NA
## no 0.589494 0.5329629 0.6520214
##
## $p.value
## two-sided
## selfemp midp.exact fisher.exact chi.square
## yes NA NA NA
## no 0 8.742405e-21 1.035021e-22
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
# χ²-тест и доли по строкам
chisq.test(cont_table_2x2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table_2x2
## X-squared = 95.407, df = 1, p-value < 2.2e-16
round(prop.table(cont_table_2x2, 1), 3)
## insurance
## selfemp yes no
## yes 0.689 0.311
## no 0.817 0.183
cont_table_2x2 <- table(insurance$insurance, insurance$health)
print(cont_table_2x2)
##
## no yes
## yes 458 6594
## no 171 1579
# H0: признаки независимы
# H1: признаки зависимы
chisq_2x2 <- chisq.test(cont_table_2x2)
print(chisq_2x2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table_2x2
## X-squared = 22.197, df = 1, p-value = 2.46e-06
# Создаем таблицу сопряженности
cont_table_2x2 <- table(insurance$insurance, insurance$limit)
print(cont_table_2x2)
##
## no yes
## yes 6052 1000
## no 1519 231
# H0: признаки независимы
# H1: признаки зависимы
chisq_2x2 <- chisq.test(cont_table_2x2)
print(chisq_2x2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table_2x2
## X-squared = 1.0402, df = 1, p-value = 0.3078
# Создаем таблицу сопряженности
cont_table <- table(insurance$insurance, insurance$education)
print(cont_table)
##
## none ged highschool bachelor master phd other
## yes 601 265 3606 1382 491 127 580
## no 518 109 828 167 33 8 87
# H0: признаки независимы
# H1: признаки зависимы
chisq_res <- chisq.test(cont_table)
print(chisq_res)
##
## Pearson's Chi-squared test
##
## data: cont_table
## X-squared = 691.5, df = 6, p-value < 2.2e-16
# Создаем таблицу сопряженности
cont_table <- table(insurance$insurance, insurance$region)
print(cont_table)
##
## midwest northeast south west
## yes 1727 1399 2402 1524
## no 296 283 673 498
# H0: признаки независимы
# H1: признаки зависимы
chisq_res <- chisq.test(cont_table)
print(chisq_res)
##
## Pearson's Chi-squared test
##
## data: cont_table
## X-squared = 81.234, df = 3, p-value < 2.2e-16
# Создаем таблицу сопряженности
cont_table <- table(insurance$insurance, insurance$ethnicity)
print(cont_table)
##
## afam cauc other
## yes 824 5953 275
## no 259 1401 90
# H0: признаки независимы
# H1: признаки зависимы
chisq_res <- chisq.test(cont_table)
print(chisq_res)
##
## Pearson's Chi-squared test
##
## data: cont_table
## X-squared = 19.474, df = 2, p-value = 5.906e-05